#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')

library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(plotlyutils)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally)
library(Rtsne)
library(ClusterR)
library(DESeq2)
library(expss)
library(knitr)

Load preprocessed dataset (preprocessing code in 20_03_02_data_preprocessing.Rmd)

# Gandal dataset
load('./../Data/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame

# GO Neuronal annotations: regex 'neuron' in GO functional annotations and label the genes that make a match as neuronal
GO_annotations = read.csv('./../Data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
              mutate('ID'=as.character(ensembl_gene_id)) %>% 
              dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
              mutate('Neuronal'=1)

# SFARI Genes
SFARI_genes = read_csv('./../../../SFARI/Data/SFARI_genes_08-29-2019_with_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]

# Update DE_info with SFARI and Neuronal information
DE_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>% 
  mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
  distinct(ID, .keep_all = TRUE) %>% left_join(GO_neuronal, by='ID') %>%
  mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
  mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`), significant=padj<0.05 & !is.na(padj))


SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

SFARI Gene list

cat(paste0('There are ', length(unique(SFARI_genes$`gene-symbol`)), ' genes with a SFARI score'))
## There are 979 genes with a SFARI score

The results from this section don’t change depending on the brain region analised, so I’m not going to repeat them here. These can be found in the folder where all the brain regions were analysed together


Exploratory Analysis

As in the previous section, the results from this section don’t change depending on the brain region analised, so I’m not going to repeat them here. These can be found in the folder where all the brain regions were analysed together


Gene Expression


Normalised data

  • The higher the SFARI score, the higher the mean expression of the gene: This pattern is quite strong and it doesn’t have any biological interpretation, so it’s probably bias in the SFARI score assignment

  • The higher the SFARI score, the lower the standard deviation: This pattern is not as strong, but it is weird because the data was originally heteroscedastic with a positive relation between mean and variance, so the fact that the relation now seems to have reversed could mean that the vst normalisation ended up affecting the highly expressed genes more than it should have when trying to correct their higher variance

plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>% 
            left_join(DE_info, by='ID')

p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
              theme(legend.position='none'))

p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
              ggtitle('Mean Expression (left) and SD (right) by SFARI score') + 
              theme(legend.position='none'))

subplot(p1, p2, nrows=1)
rm(plot_data, p1, p2)


Raw data

Just to corroborate that the relation between sd and SFARI score used to be in the opposite direction before the normalisation: The higher the SFARI score the higher the mean expression and the higher the standard deviation

*There are a lot of outliers, but the plot is interactive so you can zoom in

# Save preprocessed results
datExpr_prep = datExpr
datMeta_prep = datMeta
DE_info_prep = DE_info

load('./../Data/filtered_raw_data.RData')

plot_data = data.frame('ID'=rownames(datExpr), 'MeanExpr'=rowMeans(datExpr), 'SDExpr'=apply(datExpr,1,sd)) %>% 
            left_join(DE_info, by='ID')

p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, MeanExpr, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
              theme(legend.position='none'))

p2 = ggplotly(plot_data %>% ggplot(aes(gene.score, SDExpr, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + theme_minimal() +
              ggtitle('Mean Expression (left) and SD (right) by SFARI score') + 
              theme(legend.position='none'))

subplot(p1, p2, nrows=1)
rm(plot_data, p1, p2)

Return to normalised version of the data

# Save preprocessed results
datExpr = datExpr_prep
datMeta = datMeta_prep
DE_info = DE_info_prep

rm(datExpr_prep, datMeta_prep, DE_info_prep)


Log Fold Change

There seems to be a negative relation between SFARI score and log fold change when it would be expected to be either positively correlated or independent from each other (this last one because there are other factors that determine if a gene is releated to Autism apart from differences in gene expression)

Wikipedia mentions the likely explanation for this: “A disadvantage and serious risk of using fold change in this setting is that it is biased and may misclassify differentially expressed genes with large differences (B − A) but small ratios (B/A), leading to poor identification of changes at high expression levels”.

Based on this, since we saw there is a strong relation between SFARI score and mean expression, the bias in log fold change affects mainly genes with high SFARI scores, which would be the ones we are most interested in.

On top of this, I believe this effect is made more extreme by the pattern found in the previous plots, since the higher expressed genes were the most affected by the normalisation transformation, ending up with a smaller variance than the rest of the data, which is related to smaller ratios. (This is a constant problem independently of the normalisation function used).

ggplotly(DE_info %>% ggplot(aes(x=gene.score, y=abs(log2FoldChange), fill=gene.score)) + 
         geom_boxplot() + scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + 
         theme_minimal() + theme(legend.position='none'))


Effects of modifying filtering threshold by SFARI score

lfc_list = seq(1, 1.3, 0.01)

all_counts = data.frame('group'='All', 'n'=as.character(nrow(DE_info)))
Neuronal_counts = data.frame('group'='Neuronal', n=as.character(sum(DE_info$Neuronal)))
lfc_counts_all = DE_info %>% group_by(`gene-score`) %>% tally %>%
                 mutate('group'=as.factor(`gene-score`), 'n'=as.character(n)) %>%
                 dplyr::select(group, n) %>%
                 bind_rows(Neuronal_counts, all_counts) %>%
                 mutate('lfc'=-1) %>%  dplyr::select(lfc, group, n)

for(lfc in lfc_list){
  
  # Recalculate DE_info with the new threshold (p-values change)
  DE_genes = results(dds, lfcThreshold=log2(lfc), altHypothesis='greaterAbs') %>% data.frame
  
  DE_genes = DE_genes %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>% 
             mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
             distinct(ID, .keep_all = TRUE) %>% left_join(GO_neuronal, by='ID') %>%
             mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
             mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`))
  
  DE_genes = DE_genes %>% filter(padj<0.05 & abs(log2FoldChange)>log2(lfc))

  
  # Calculate counts by groups
  all_counts = data.frame('group'='All', 'n'=as.character(nrow(DE_genes)))
  Neuronal_counts = data.frame('group'='Neuronal', n=as.character(sum(DE_genes$Neuronal)))
  lfc_counts = DE_genes %>% group_by(`gene-score`) %>% tally %>%
               mutate('group'=`gene-score`, 'n'=as.character(n)) %>%
               bind_rows(Neuronal_counts, all_counts) %>%
               mutate('lfc'=lfc) %>% dplyr::select(lfc, group, n)
  
  
  # Update lfc_counts_all
  lfc_counts_all = lfc_counts_all %>% bind_rows(lfc_counts)
}

# Add missing entries with 0s
lfc_counts_all = expand.grid('group'=unique(lfc_counts_all$group), 'lfc'=unique(lfc_counts_all$lfc)) %>% 
  left_join(lfc_counts_all, by=c('group','lfc')) %>% replace(is.na(.), 0)

# Calculate percentage of each group remaining
tot_counts = DE_info %>% group_by(`gene-score`) %>% tally() %>% filter(`gene-score`!='None') %>%
             mutate('group'=`gene-score`, 'tot'=n) %>% dplyr::select(group, tot) %>%
             bind_rows(data.frame('group'='Neuronal', 'tot'=sum(DE_info$Neuronal)),
                       data.frame('group'='All', 'tot'=nrow(DE_info)))

lfc_counts_all = lfc_counts_all %>% filter(lfc!=-1, group!='None') %>% 
                 left_join(tot_counts, by='group') %>% mutate('perc'=round(100*as.numeric(n)/tot,2))


# Plot change of number of genes
ggplotly(lfc_counts_all %>% ggplot(aes(lfc, perc, color=group)) + geom_point(aes(id=n)) + geom_line() + 
         scale_color_manual(values=SFARI_colour_hue(r=1:8)) + ylab('% of remaining genes') +  xlab('Fold Change') + 
         ggtitle('Effect of filtering thresholds by SFARI score') + theme_minimal())
rm(lfc_list, all_counts, Neuronal_counts, lfc_counts_all, lfc, lfc_counts, lfc_counts_all, tot_counts, lfc_counts_all)
cat(paste0('There are ', sum(DE_info$padj<0.05 & DE_info$`gene-score` != 'None' & !is.na(DE_info$padj)),
           ' SFARI genes that are differentially expressed'))
## There are 103 SFARI genes that are differentially expressed
kable(DE_info %>% filter(padj<0.05 & `gene-score` != 'None' & !is.na(padj)) %>% 
      dplyr::select(ID, `gene-symbol`, log2FoldChange, padj, `gene-score`, Neuronal) %>% arrange(padj))
ID gene-symbol log2FoldChange padj gene-score Neuronal
ENSG00000104725 NEFL -0.8599331 0.0000003 5 0
ENSG00000154263 ABCA10 -0.9381761 0.0000198 4 0
ENSG00000196876 SCN8A -0.6888414 0.0000242 3 1
ENSG00000178718 RPP25 -0.7894551 0.0000738 5 0
ENSG00000204466 DGKK -2.7574355 0.0001817 5 0
ENSG00000156463 SH3RF2 -1.0948308 0.0002298 5 0
ENSG00000171735 CAMTA1 -0.6697040 0.0002666 5 0
ENSG00000132639 SNAP25 -0.7360379 0.0004682 4 1
ENSG00000147402 GABRQ 1.7896069 0.0004998 5 0
ENSG00000159082 SYNJ1 -0.4543946 0.0005073 4 0
ENSG00000103528 SYT17 1.3095456 0.0006371 4 0
ENSG00000185666 SYN3 -0.6273865 0.0006371 5 0
ENSG00000163618 CADPS -0.3473908 0.0006514 4 0
ENSG00000172260 NEGR1 -0.5130772 0.0007719 4 1
ENSG00000144285 SCN1A -1.0373485 0.0009092 3 1
ENSG00000078328 RBFOX1 -0.5661002 0.0010367 3 0
ENSG00000147862 NFIB 0.5881471 0.0010367 4 1
ENSG00000169432 SCN9A 1.4664867 0.0010778 2 1
ENSG00000109911 ELP4 0.5266608 0.0014462 3 0
ENSG00000115159 GPD2 0.5808588 0.0014462 4 0
ENSG00000022355 GABRA1 -0.6276506 0.0014755 5 0
ENSG00000113657 DPYSL3 0.7158619 0.0014835 4 1
ENSG00000136535 TBR1 -0.8060293 0.0014835 1 1
ENSG00000197635 DPP4 -1.5566496 0.0014835 4 0
ENSG00000174996 KLC2 -0.5810603 0.0015643 5 1
ENSG00000118596 SLC16A7 -0.6039421 0.0016182 5 0
ENSG00000116117 PARD3B 0.6164087 0.0019351 3 0
ENSG00000116852 KIF21B 0.7024335 0.0022071 5 0
ENSG00000163288 GABRB1 0.7478894 0.0026413 5 1
ENSG00000164434 FABP7 1.1327051 0.0032564 6 1
ENSG00000079482 OPHN1 0.5014590 0.0035667 3 0
ENSG00000091831 ESR1 1.7251886 0.0036514 5 0
ENSG00000171385 KCND3 -0.3859777 0.0037243 4 1
ENSG00000129159 KCNC1 -0.6891124 0.0037845 4 1
ENSG00000164638 SLC29A4 0.8000841 0.0037845 4 0
ENSG00000147044 CASK 0.4032438 0.0038136 4 0
ENSG00000186297 GABRA5 0.9794783 0.0038665 5 1
ENSG00000169439 SDC2 0.7008742 0.0038697 4 1
ENSG00000117594 HSD11B1 -1.7954445 0.0040552 4 0
ENSG00000150275 PCDH15 0.9209430 0.0041941 4 0
ENSG00000170579 DLGAP1 -0.4625938 0.0049824 3 0
ENSG00000119335 SET 0.2649047 0.0050086 2 1
ENSG00000182985 CADM1 0.6356541 0.0052865 4 1
ENSG00000196557 CACNA1H 1.2060762 0.0072262 2 0
ENSG00000139734 DIAPH3 1.1809753 0.0075071 5 0
ENSG00000115840 SLC25A12 -0.5094898 0.0078328 4 0
ENSG00000174469 CNTNAP2 -0.3510615 0.0078328 2 1
ENSG00000170289 CNGB3 -2.5998568 0.0081129 4 0
ENSG00000196090 PTPRT -0.5059030 0.0092666 4 0
ENSG00000135631 RAB11FIP5 -0.5108900 0.0095890 4 0
ENSG00000129911 KLF16 0.5888674 0.0097244 4 0
ENSG00000106113 CRHR2 -1.1356099 0.0110483 5 0
ENSG00000074590 NUAK1 -0.4406348 0.0117834 3 0
ENSG00000105409 ATP1A3 -0.6151818 0.0125335 4 0
ENSG00000152208 GRID2 0.7948104 0.0125335 4 1
ENSG00000050628 PTGER3 1.6825331 0.0130087 5 0
ENSG00000065989 PDE4A -0.6149110 0.0131282 5 0
ENSG00000155886 SLC24A2 -0.6930743 0.0137060 4 0
ENSG00000183454 GRIN2A -0.4734245 0.0138584 3 1
ENSG00000149403 GRIK4 0.4796942 0.0147415 4 0
ENSG00000128739 SNRPN -0.5775820 0.0157125 5 0
ENSG00000080298 RFX3 0.3563531 0.0163146 4 0
ENSG00000144331 ZNF385B -0.6383473 0.0169365 4 0
ENSG00000150995 ITPR1 -0.4301551 0.0175876 4 0
ENSG00000189108 IL1RAPL2 0.6676671 0.0189764 4 0
ENSG00000183098 GPC6 0.6720113 0.0195965 4 0
ENSG00000187775 DNAH17 -0.6896587 0.0198271 4 0
ENSG00000144290 SLC4A10 -0.3420210 0.0203812 4 0
ENSG00000124140 SLC12A5 -0.4596906 0.0206587 3 1
ENSG00000131165 CHMP1A 0.2921451 0.0219020 3 0
ENSG00000153575 TUBGCP5 -0.5029916 0.0224110 4 0
ENSG00000126351 THRA 0.4338863 0.0244411 4 0
ENSG00000155511 GRIA1 0.4918262 0.0247583 2 1
ENSG00000165355 FBXO33 -0.5066870 0.0258730 4 0
ENSG00000177565 TBL1XR1 0.2398571 0.0260473 2 0
ENSG00000169862 CTNND2 0.2958835 0.0264162 2 1
ENSG00000170004 CHD3 0.4328755 0.0269678 4 0
ENSG00000136854 STXBP1 -0.5556416 0.0271662 3 1
ENSG00000157087 ATP2B2 -0.7063162 0.0277102 3 1
ENSG00000162599 NFIA 0.4213066 0.0279680 4 0
ENSG00000102882 MAPK3 0.6456731 0.0282559 4 0
ENSG00000106089 STX1A 0.4435688 0.0292019 4 1
ENSG00000034053 APBA2 -0.2156699 0.0292689 4 0
ENSG00000144406 UNC80 -0.3474040 0.0318819 4 0
ENSG00000215045 GRID2IP -0.9524291 0.0322223 4 0
ENSG00000005379 TSPOAP1 0.4988449 0.0322896 4 0
ENSG00000007171 NOS2 -0.7024062 0.0327215 5 0
ENSG00000172554 SNTG2 1.3347078 0.0328585 4 0
ENSG00000166147 FBN1 -0.3855763 0.0337599 3 0
ENSG00000107518 ATRNL1 -0.4331538 0.0349398 5 0
ENSG00000158089 GALNT14 0.4990843 0.0368939 4 0
ENSG00000102181 CD99L2 -0.3353899 0.0390800 4 0
ENSG00000157483 MYO1E -0.5058851 0.0401325 4 0
ENSG00000132294 EFR3A -0.3132308 0.0419853 3 0
ENSG00000105926 MPP6 0.3750514 0.0424932 4 0
ENSG00000109906 ZBTB16 -0.4032127 0.0430216 4 0
ENSG00000197977 ELOVL2 0.7903170 0.0430216 4 0
ENSG00000003147 ICA1 -0.5148752 0.0431301 3 0
ENSG00000167114 SLC27A4 -0.2648417 0.0433964 4 0
ENSG00000040731 CDH10 0.4288321 0.0464164 4 0
ENSG00000100362 PVALB -1.6906695 0.0477081 5 0
ENSG00000213023 SYT3 -0.4651952 0.0481690 6 0
ENSG00000172340 SUCLG2 0.5826945 0.0488796 6 0

Session info

sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] knitr_1.23                  expss_0.10.1               
##  [3] DESeq2_1.24.0               SummarizedExperiment_1.14.1
##  [5] DelayedArray_0.10.0         BiocParallel_1.18.1        
##  [7] matrixStats_0.54.0          Biobase_2.44.0             
##  [9] GenomicRanges_1.36.0        GenomeInfoDb_1.20.0        
## [11] IRanges_2.18.2              S4Vectors_0.22.0           
## [13] BiocGenerics_0.30.0         ClusterR_1.2.1             
## [15] gtools_3.8.1                Rtsne_0.15                 
## [17] GGally_1.4.0                gridExtra_2.3              
## [19] viridis_0.5.1               viridisLite_0.3.0          
## [21] RColorBrewer_1.1-2          plotlyutils_0.0.0.9000     
## [23] plotly_4.9.1                glue_1.3.1                 
## [25] reshape2_1.4.3              forcats_0.4.0              
## [27] stringr_1.4.0               dplyr_0.8.2                
## [29] purrr_0.3.3                 readr_1.3.1                
## [31] tidyr_0.8.3                 tibble_2.1.3               
## [33] ggplot2_3.2.0               tidyverse_1.2.1            
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.4-1       htmlTable_1.13.1       XVector_0.24.0        
##  [4] base64enc_0.1-3        rstudioapi_0.10        bit64_0.9-7           
##  [7] AnnotationDbi_1.46.1   lubridate_1.7.4        xml2_1.2.2            
## [10] splines_3.6.2          geneplotter_1.62.0     zeallot_0.1.0         
## [13] Formula_1.2-3          jsonlite_1.6           broom_0.5.2           
## [16] annotate_1.62.0        cluster_2.1.0          shiny_1.3.2           
## [19] compiler_3.6.2         httr_1.4.0             backports_1.1.4       
## [22] assertthat_0.2.1       Matrix_1.2-18          lazyeval_0.2.2        
## [25] cli_1.1.0              later_0.8.0            acepack_1.4.1         
## [28] htmltools_0.3.6        tools_3.6.2            gmp_0.5-13.5          
## [31] gtable_0.3.0           GenomeInfoDbData_1.2.1 Rcpp_1.0.1            
## [34] cellranger_1.1.0       vctrs_0.2.0            nlme_3.1-144          
## [37] crosstalk_1.0.0        xfun_0.8               rvest_0.3.4           
## [40] mime_0.7               XML_3.98-1.20          zlibbioc_1.30.0       
## [43] scales_1.0.0           promises_1.0.1         hms_0.5.1             
## [46] yaml_2.2.0             memoise_1.1.0          rpart_4.1-15          
## [49] reshape_0.8.8          latticeExtra_0.6-28    stringi_1.4.3         
## [52] RSQLite_2.1.2          highr_0.8              genefilter_1.66.0     
## [55] checkmate_1.9.4        rlang_0.4.2            pkgconfig_2.0.2       
## [58] bitops_1.0-6           evaluate_0.14          lattice_0.20-40       
## [61] labeling_0.3           htmlwidgets_1.3        bit_1.1-14            
## [64] tidyselect_0.2.5       plyr_1.8.4             magrittr_1.5          
## [67] R6_2.4.0               generics_0.0.2         Hmisc_4.2-0           
## [70] DBI_1.0.0              pillar_1.4.2           haven_2.1.1           
## [73] foreign_0.8-75         withr_2.1.2            survival_3.1-8        
## [76] RCurl_1.95-4.12        nnet_7.3-12            modelr_0.1.5          
## [79] crayon_1.3.4           rmarkdown_1.13         locfit_1.5-9.1        
## [82] grid_3.6.2             readxl_1.3.1           data.table_1.12.2     
## [85] blob_1.2.0             digest_0.6.19          xtable_1.8-4          
## [88] httpuv_1.5.1           munsell_0.5.0